import lxml.etree as le

# with open('meiju1.html','r',encoding='utf-8') as f:
#     html=f.read()
#     # print(html)
#     html_x=le.HTML(html)
#     # print(html_x)
#     title_s=html_x.xpath('//div[contains(@class,"threadlist_title pull_left j_th_tit")]/a/text()')
#     data_s=[]
#     # print(title_s)
#     for title in title_s:
#         print(title)


import re
# In meiju2.html the relevant markup is wrapped in HTML comments; when the
# page is viewed in a browser, JavaScript strips those comments so it renders
# normally. XPath can only reach markup that is not commented out, so the
# titles are pulled from the raw text with a regular expression instead.
with open('meiju2.html', 'r', encoding='utf-8') as f:
    html = f.read()

# Collapse the page onto a single line. The pattern below is applied without
# re.DOTALL, so after this step its '.*?' parts can span what were originally
# separate lines of markup.
html = html.replace('\n', '')

# For every thread-title <div>, capture the text of the first <a> after it.
title_pattern = '<div class="threadlist_title pull_left j_th_tit ">.*?<a.*?>(.*?)</a>'
titles = re.compile(title_pattern).findall(html)

# Print one captured title per line.
for title in titles:
    print(title)